import os
import glob
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rc
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from IPython.display import display
from src import data_processing as dp
from src import sanky_chart as sc
from src import match_length as ml
from src import serving_analysis as sa
from src import mental_toughness as mt
from src import rising_star as rs
# Read all match CSV files matching the glob pattern through the project loader.
MATCH_FILES_PATTERN = "data/atp_matches*.csv"
all_data = dp.read_gslam_files(MATCH_FILES_PATTERN)
# Preview of the first rows (only rendered in an interactive session).
all_data.head()
| tourney_name | surface | tourney_level | winner_name | loser_name | score | best_of | round | minutes | w_ace | ... | w_bpSaved | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_1stWon | l_2ndWon | l_bpSaved | l_bpFaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Roland Garros | Clay | G | Rafael Nadal | Marcos Daniel | 7-5 6-4 6-3 | 5 | R128 | 143.0 | 3.0 | ... | 4.0 | 7.0 | 4.0 | 1.0 | 93.0 | 52.0 | 32.0 | 17.0 | 4.0 | 11.0 |
| 1 | Roland Garros | Clay | G | Teymuraz Gabashvili | Igor Kunitsyn | 6-7(6) 7-6(5) 6-3 6-1 | 5 | R128 | 191.0 | 12.0 | ... | 3.0 | 7.0 | 3.0 | 1.0 | 129.0 | 86.0 | 47.0 | 23.0 | 4.0 | 12.0 |
| 2 | Roland Garros | Clay | G | Andrey Golubev | Denis Gremelmayr | 6-3 6-3 6-2 | 5 | R128 | 98.0 | 7.0 | ... | 2.0 | 3.0 | 4.0 | 3.0 | 83.0 | 54.0 | 32.0 | 10.0 | 1.0 | 7.0 |
| 3 | Roland Garros | Clay | G | Lleyton Hewitt | Ivo Karlovic | 6-7(1) 6-7(4) 7-6(4) 6-4 6-3 | 5 | R128 | 236.0 | 19.0 | ... | 5.0 | 6.0 | 55.0 | 4.0 | 169.0 | 124.0 | 102.0 | 20.0 | 4.0 | 8.0 |
| 4 | Roland Garros | Clay | G | Robin Soderling | Kevin Kim | 7-6(4) 7-6(4) 6-2 | 5 | R128 | 137.0 | 11.0 | ... | 3.0 | 7.0 | 10.0 | 3.0 | 112.0 | 56.0 | 40.0 | 23.0 | 7.0 | 13.0 |
5 rows × 25 columns
Remove the Big Three (Roger Federer, Rafael Nadal, Novak Djokovic) from the data and store the resulting data frame in `data_other`.
# Players excluded from the "other players" baseline.
BIG_THREE = ["Roger Federer", "Rafael Nadal", "Novak Djokovic"]
# Filter on the 'winner' side via the project helper.
data_other = dp.remove_player(BIG_THREE, all_data, 'winner')
# Column-wise totals of the remaining matches; reused by the serving analysis.
sum_other = data_other.sum()
data_other.head()
| tourney_name | surface | tourney_level | winner_name | loser_name | score | best_of | round | minutes | w_ace | ... | w_bpSaved | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_1stWon | l_2ndWon | l_bpSaved | l_bpFaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Roland Garros | Clay | G | Teymuraz Gabashvili | Igor Kunitsyn | 6-7(6) 7-6(5) 6-3 6-1 | 5 | R128 | 191.0 | 12.0 | ... | 3.0 | 7.0 | 3.0 | 1.0 | 129.0 | 86.0 | 47.0 | 23.0 | 4.0 | 12.0 |
| 2 | Roland Garros | Clay | G | Andrey Golubev | Denis Gremelmayr | 6-3 6-3 6-2 | 5 | R128 | 98.0 | 7.0 | ... | 2.0 | 3.0 | 4.0 | 3.0 | 83.0 | 54.0 | 32.0 | 10.0 | 1.0 | 7.0 |
| 3 | Roland Garros | Clay | G | Lleyton Hewitt | Ivo Karlovic | 6-7(1) 6-7(4) 7-6(4) 6-4 6-3 | 5 | R128 | 236.0 | 19.0 | ... | 5.0 | 6.0 | 55.0 | 4.0 | 169.0 | 124.0 | 102.0 | 20.0 | 4.0 | 8.0 |
| 4 | Roland Garros | Clay | G | Robin Soderling | Kevin Kim | 7-6(4) 7-6(4) 6-2 | 5 | R128 | 137.0 | 11.0 | ... | 3.0 | 7.0 | 10.0 | 3.0 | 112.0 | 56.0 | 40.0 | 23.0 | 7.0 | 13.0 |
| 5 | Roland Garros | Clay | G | Denis Istomin | Santiago Giraldo | 6-3 6-7(2) 6-3 7-6(4) | 5 | R128 | 205.0 | 21.0 | ... | 1.0 | 3.0 | 5.0 | 2.0 | 158.0 | 86.0 | 60.0 | 40.0 | 11.0 | 15.0 |
5 rows × 25 columns
# Matches won by Roger Federer; the winner column is constant here, so it is dropped.
federer_wins = all_data['winner_name'] == "Roger Federer"
data_federer = all_data.loc[federer_wins].drop(columns=['winner_name'])
data_federer.head()
| tourney_name | surface | tourney_level | loser_name | score | best_of | round | minutes | w_ace | w_df | ... | w_bpSaved | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_1stWon | l_2ndWon | l_bpSaved | l_bpFaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 63 | Roland Garros | Clay | G | Alberto Martin | 6-4 6-3 6-2 | 5 | R128 | 103.0 | 10.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 3.0 | 87.0 | 56.0 | 34.0 | 13.0 | 5.0 | 10.0 |
| 95 | Roland Garros | Clay | G | Jose Acasuso | 7-6(8) 5-7 7-6(2) 6-2 | 5 | R64 | 205.0 | 11.0 | 0.0 | ... | 5.0 | 10.0 | 5.0 | 2.0 | 154.0 | 96.0 | 59.0 | 31.0 | 6.0 | 12.0 |
| 111 | Roland Garros | Clay | G | Paul Henri Mathieu | 4-6 6-1 6-4 6-4 | 5 | R32 | 166.0 | 14.0 | 3.0 | ... | 5.0 | 7.0 | 8.0 | 3.0 | 126.0 | 75.0 | 49.0 | 24.0 | 11.0 | 16.0 |
| 119 | Roland Garros | Clay | G | Tommy Haas | 6-7(4) 5-7 6-4 6-0 6-2 | 5 | R16 | 187.0 | 16.0 | 1.0 | ... | 3.0 | 5.0 | 11.0 | 3.0 | 145.0 | 93.0 | 63.0 | 28.0 | 8.0 | 15.0 |
| 123 | Roland Garros | Clay | G | Gael Monfils | 7-6(6) 6-2 6-4 | 5 | QF | 130.0 | 8.0 | 1.0 | ... | 4.0 | 4.0 | 7.0 | 1.0 | 105.0 | 61.0 | 41.0 | 22.0 | 2.0 | 5.0 |
5 rows × 24 columns
# Matches won by Rafael Nadal; the winner column is constant here, so it is dropped.
nadal_wins = all_data['winner_name'] == "Rafael Nadal"
data_nadal = all_data.loc[nadal_wins].drop(columns=['winner_name'])
data_nadal.head()
| tourney_name | surface | tourney_level | loser_name | score | best_of | round | minutes | w_ace | w_df | ... | w_bpSaved | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_1stWon | l_2ndWon | l_bpSaved | l_bpFaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Roland Garros | Clay | G | Marcos Daniel | 7-5 6-4 6-3 | 5 | R128 | 143.0 | 3.0 | 0.0 | ... | 4.0 | 7.0 | 4.0 | 1.0 | 93.0 | 52.0 | 32.0 | 17.0 | 4.0 | 11.0 |
| 64 | Roland Garros | Clay | G | Teymuraz Gabashvili | 6-1 6-4 6-2 | 5 | R64 | 137.0 | 2.0 | 1.0 | ... | 4.0 | 4.0 | 2.0 | 3.0 | 85.0 | 52.0 | 34.0 | 12.0 | 8.0 | 13.0 |
| 96 | Roland Garros | Clay | G | Lleyton Hewitt | 6-1 6-3 6-1 | 5 | R32 | 111.0 | 7.0 | 0.0 | ... | 2.0 | 3.0 | 2.0 | 2.0 | 75.0 | 46.0 | 26.0 | 9.0 | 6.0 | 13.0 |
| 301 | US Open | Hard | G | Richard Gasquet | 6-2 6-2 6-3 | 5 | R128 | 101.0 | 7.0 | 0.0 | ... | 0.0 | 0.0 | 4.0 | 4.0 | 75.0 | 49.0 | 29.0 | 12.0 | 5.0 | 10.0 |
| 341 | US Open | Hard | G | Nicolas Kiefer | 6-0 3-6 6-3 6-4 | 5 | R64 | 179.0 | 3.0 | 1.0 | ... | 1.0 | 3.0 | 7.0 | 5.0 | 133.0 | 54.0 | 38.0 | 33.0 | 9.0 | 15.0 |
5 rows × 24 columns
# Matches won by Novak Djokovic; the winner column is constant here, so it is dropped.
djokovic_wins = all_data['winner_name'] == "Novak Djokovic"
data_djoker = all_data.loc[djokovic_wins].drop(columns=['winner_name'])
data_djoker.head()
| tourney_name | surface | tourney_level | loser_name | score | best_of | round | minutes | w_ace | w_df | ... | w_bpSaved | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_1stWon | l_2ndWon | l_bpSaved | l_bpFaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 47 | Roland Garros | Clay | G | Nicolas Lapentti | 6-3 3-1 RET | 5 | R128 | 78.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 58.0 | 38.0 | 23.0 | 8.0 | 5.0 | 7.0 |
| 87 | Roland Garros | Clay | G | Sergiy Stakhovsky | 6-3 6-4 6-1 | 5 | R64 | 124.0 | 5.0 | 2.0 | ... | 3.0 | 5.0 | 6.0 | 3.0 | 107.0 | 62.0 | 38.0 | 16.0 | 11.0 | 18.0 |
| 174 | Wimbledon | Grass | G | Julien Benneteau | 6-7(8) 7-6(1) 6-2 6-4 | 5 | R128 | 209.0 | 14.0 | 4.0 | ... | 3.0 | 4.0 | 3.0 | 4.0 | 152.0 | 104.0 | 71.0 | 24.0 | 8.0 | 12.0 |
| 214 | Wimbledon | Grass | G | Simon Greul | 7-5 6-1 6-4 | 5 | R64 | 120.0 | 13.0 | 6.0 | ... | 6.0 | 10.0 | 0.0 | 2.0 | 87.0 | 53.0 | 26.0 | 16.0 | 7.0 | 15.0 |
| 234 | Wimbledon | Grass | G | Mardy Fish | 6-4 6-4 6-4 | 5 | R32 | 119.0 | 8.0 | 1.0 | ... | 0.0 | 1.0 | 6.0 | 1.0 | 96.0 | 56.0 | 42.0 | 16.0 | 8.0 | 12.0 |
5 rows × 24 columns
# Matches won by Alexander Zverev; the winner column is constant here, so it is dropped.
zverev_wins = all_data['winner_name'] == "Alexander Zverev"
data_zverev = all_data.loc[zverev_wins].drop(columns=['winner_name'])
data_zverev.head()
| tourney_name | surface | tourney_level | loser_name | score | best_of | round | minutes | w_ace | w_df | ... | w_bpSaved | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_1stWon | l_2ndWon | l_bpSaved | l_bpFaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1164 | Roland Garros | Clay | G | Pierre Hugues Herbert | 5-7 6-2 7-6(6) 7-5 | 5 | R128 | 188.0 | 8.0 | 6.0 | ... | 5.0 | 7.0 | 7.0 | 5.0 | 148.0 | 76.0 | 57.0 | 35.0 | 8.0 | 12.0 |
| 1217 | Roland Garros | Clay | G | Stephane Robert | 6-1 3-6 6-1 6-4 | 5 | R64 | 129.0 | 5.0 | 2.0 | ... | 5.0 | 8.0 | 11.0 | 7.0 | 100.0 | 48.0 | 35.0 | 20.0 | 5.0 | 12.0 |
| 1306 | Wimbledon | Grass | G | Paul Henri Mathieu | 6-3 6-4 6-2 | 5 | R128 | 96.0 | 20.0 | 2.0 | ... | 5.0 | 7.0 | 3.0 | 8.0 | 86.0 | 48.0 | 26.0 | 19.0 | 3.0 | 9.0 |
| 1352 | Wimbledon | Grass | G | Mikhail Youzhny | 6-4 3-6 6-0 4-6 6-2 | 5 | R64 | 198.0 | 10.0 | 10.0 | ... | 7.0 | 11.0 | 3.0 | 2.0 | 137.0 | 90.0 | 55.0 | 22.0 | 4.0 | 12.0 |
| 1441 | US Open | Hard | G | Daniel Brands | 3-6 6-1 6-4 7-6(4) | 5 | R128 | 155.0 | 4.0 | 7.0 | ... | 2.0 | 4.0 | 7.0 | 9.0 | 126.0 | 70.0 | 49.0 | 27.0 | 8.0 | 12.0 |
5 rows × 24 columns
# Matches won by Dominic Thiem; the winner column is constant here, so it is dropped.
thiem_wins = all_data['winner_name'] == "Dominic Thiem"
data_thiem = all_data.loc[thiem_wins].drop(columns=['winner_name'])
data_thiem.head()
| tourney_name | surface | tourney_level | loser_name | score | best_of | round | minutes | w_ace | w_df | ... | w_bpSaved | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_1stWon | l_2ndWon | l_bpSaved | l_bpFaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1036 | Australian Open | Hard | G | Leonardo Mayer | 6-2 7-6 4-6 7-6 | 5 | R128 | 186.0 | 7.0 | 4.0 | ... | 3.0 | 6.0 | 8.0 | 5.0 | 139.0 | 91.0 | 61.0 | 24.0 | 5.0 | 9.0 |
| 1090 | Australian Open | Hard | G | Nicolas Almagro | 6-3 6-1 6-3 | 5 | R64 | 101.0 | 5.0 | 1.0 | ... | 0.0 | 0.0 | 13.0 | 4.0 | 100.0 | 56.0 | 33.0 | 19.0 | 11.0 | 17.0 |
| 1166 | Roland Garros | Clay | G | Inigo Cervantes Huegun | 3-6 6-2 7-5 6-1 | 5 | R128 | 158.0 | 5.0 | 1.0 | ... | 6.0 | 9.0 | 6.0 | 4.0 | 118.0 | 69.0 | 44.0 | 19.0 | 10.0 | 17.0 |
| 1218 | Roland Garros | Clay | G | Guillermo Garcia Lopez | 7-5 6-4 7-6(3) | 5 | R64 | 145.0 | 7.0 | 7.0 | ... | 6.0 | 9.0 | 5.0 | 4.0 | 112.0 | 52.0 | 37.0 | 28.0 | 7.0 | 12.0 |
| 1244 | Roland Garros | Clay | G | Alexander Zverev | 6-7(4) 6-3 6-3 6-3 | 5 | R32 | 169.0 | 6.0 | 2.0 | ... | 7.0 | 8.0 | 6.0 | 5.0 | 121.0 | 72.0 | 49.0 | 22.0 | 10.0 | 15.0 |
5 rows × 24 columns
# The Sankey diagram is limited to the 20 players with the most wins so that
# the visualization stays readable.
tgs = len(all_data)  # total number of matches in the data set
# One row per winner with the number of matches won in the 'size' column.
wins_per_player = all_data.groupby('winner_name', as_index=False).size()
sanky_data = wins_per_player.sort_values('size', ascending=False).head(20)
sanky_data
| winner_name | size | |
|---|---|---|
| 428 | Roger Federer | 324 |
| 377 | Novak Djokovic | 284 |
| 407 | Rafael Nadal | 279 |
| 50 | Andy Murray | 182 |
| 104 | David Ferrer | 143 |
| 495 | Tomas Berdych | 140 |
| 459 | Stan Wawrinka | 140 |
| 51 | Andy Roddick | 116 |
| 243 | Jo-Wilfried Tsonga | 115 |
| 317 | Marin Cilic | 115 |
| 151 | Fernando Verdasco | 110 |
| 164 | Gael Monfils | 108 |
| 288 | Lleyton Hewitt | 100 |
| 417 | Richard Gasquet | 98 |
| 257 | Juan Martin del Potro | 97 |
| 273 | Kei Nishikori | 95 |
| 499 | Tommy Robredo | 93 |
| 353 | Mikhail Youzhny | 88 |
| 147 | Feliciano Lopez | 86 |
| 354 | Milos Raonic | 84 |
# Every row carries the same label showing the overall match count.
total_label = f'Grand slam matches from 2003 to 2020: {+tgs}'
sanky_data['total_grand_slams'] = total_label
# Turn each name into "<player>: <wins>".
win_counts = sanky_data['size'].astype(str)
sanky_data["winner_name"] = sanky_data["winner_name"] + ": " + win_counts
sanky_data
| winner_name | size | total_grand_slams | |
|---|---|---|---|
| 428 | Roger Federer: 324 | 324 | Grand slam matches from 2003 to 2020: 8732 |
| 377 | Novak Djokovic: 284 | 284 | Grand slam matches from 2003 to 2020: 8732 |
| 407 | Rafael Nadal: 279 | 279 | Grand slam matches from 2003 to 2020: 8732 |
| 50 | Andy Murray: 182 | 182 | Grand slam matches from 2003 to 2020: 8732 |
| 104 | David Ferrer: 143 | 143 | Grand slam matches from 2003 to 2020: 8732 |
| 495 | Tomas Berdych: 140 | 140 | Grand slam matches from 2003 to 2020: 8732 |
| 459 | Stan Wawrinka: 140 | 140 | Grand slam matches from 2003 to 2020: 8732 |
| 51 | Andy Roddick: 116 | 116 | Grand slam matches from 2003 to 2020: 8732 |
| 243 | Jo-Wilfried Tsonga: 115 | 115 | Grand slam matches from 2003 to 2020: 8732 |
| 317 | Marin Cilic: 115 | 115 | Grand slam matches from 2003 to 2020: 8732 |
| 151 | Fernando Verdasco: 110 | 110 | Grand slam matches from 2003 to 2020: 8732 |
| 164 | Gael Monfils: 108 | 108 | Grand slam matches from 2003 to 2020: 8732 |
| 288 | Lleyton Hewitt: 100 | 100 | Grand slam matches from 2003 to 2020: 8732 |
| 417 | Richard Gasquet: 98 | 98 | Grand slam matches from 2003 to 2020: 8732 |
| 257 | Juan Martin del Potro: 97 | 97 | Grand slam matches from 2003 to 2020: 8732 |
| 273 | Kei Nishikori: 95 | 95 | Grand slam matches from 2003 to 2020: 8732 |
| 499 | Tommy Robredo: 93 | 93 | Grand slam matches from 2003 to 2020: 8732 |
| 353 | Mikhail Youzhny: 88 | 88 | Grand slam matches from 2003 to 2020: 8732 |
| 147 | Feliciano Lopez: 86 | 86 | Grand slam matches from 2003 to 2020: 8732 |
| 354 | Milos Raonic: 84 | 84 | Grand slam matches from 2003 to 2020: 8732 |
# The project helper returns the trace and the layout for the Sankey chart.
sankey_parts = sc.sanky_chart_data(sanky_data)
data, layout = sankey_parts
fig = go.Figure(layout=layout, data=[data])
fig.show()
def _tag_other(frame, result):
    """Prepend 'Name' ("Other") and 'Result' columns to *frame* in place and return it."""
    row_count = len(frame)
    frame.insert(0, column="Result", value=[result] * row_count)
    frame.insert(0, column="Name", value=["Other"] * row_count)
    return frame


# Losing / winning match-length frames for each of the Big Three.
lose_federer, win_federer = ml.player_match_length("Roger Federer", data_other, data_federer)
lose_nadal, win_nadal = ml.player_match_length("Rafael Nadal", data_other, data_nadal)
lose_djoker, win_djoker = ml.player_match_length("Novak Djokovic", data_other, data_djoker)

# Same for all other players; only matches of at most 400 minutes are kept.
win_other = _tag_other(data_other[data_other['minutes'] <= 400], 'Win')
lose_other = dp.remove_player(["Roger Federer", "Rafael Nadal", "Novak Djokovic"], all_data, 'loser')
lose_other = _tag_other(lose_other[lose_other['minutes'] <= 400], 'Lose')

# Stack everything into one long frame for seaborn.
violin_frames = [
    win_federer, lose_federer,
    win_nadal, lose_nadal,
    win_djoker, lose_djoker,
    win_other, lose_other,
]
violin_data = pd.concat(violin_frames).rename(columns={"minutes": "Minutes"}, errors="raise")

sns.set(style="darkgrid")
violin_axes = sns.violinplot(
    x="Name",
    y="Minutes",
    hue="Result",
    data=violin_data,
    palette="Pastel1",
    split=True,
)
violin_axes.set(title='Match Length of Big 3 and Other Players')
plt.show()
# Per-player data and totals for the Big Three, as returned by sa.player_data.
data_federer, sum_federer = sa.player_data("Roger Federer", all_data)
data_nadal, sum_nadal = sa.player_data("Rafael Nadal", all_data)
data_djoker, sum_djoker = sa.player_data("Novak Djokovic", all_data)

# Same for the two rising stars, Alexander Zverev and Dominic Thiem.
data_zverev, sum_zverev = sa.player_data("Alexander Zverev", all_data)
data_thiem, sum_thiem = sa.player_data("Dominic Thiem", all_data)

# Per-winner aggregates produced by sa.player_group_data for:
ace_winner = sa.player_group_data(all_data, 'w_ace')                # aces
firstserve_winner = sa.player_group_data(all_data, 'w_1stWon')      # first-serve points won
secondserve_winner = sa.player_group_data(all_data, 'w_2ndWon')     # second-serve points won

# Total double faults per winner (index: winner_name, single column: w_df).
df_winner = all_data.groupby('winner_name')[['w_df']].sum()
df_winner.head()
| w_df | |
|---|---|
| winner_name | |
| Adam Pavlasek | 10.0 |
| Adrian Mannarino | 95.0 |
| Adrian Menendez Maceiras | 1.0 |
| Adrian Ungur | 6.0 |
| Adrian Voinea | 3.0 |
# Stacked bar chart of the serve breakdown for the Big Three and all other players.
#
# NOTE: the original cell unpacked into `of, os, od` and `df, ds, dd`, which
# rebound the imported `os` module to a number and used `df` first as a scalar
# and later as a DataFrame. The colliding names are renamed here; the values
# and the resulting plot are unchanged.

# Data
r = [0, 1, 2, 3]  # x positions of the four bars

# Three serving values per player/group; judging by the bar labels below they
# are first-serve wins, second-serve wins and double faults -- TODO confirm
# against sa.serving_data.
ff, fs, fd = sa.serving_data(sum_federer)
nf, ns, nd = sa.serving_data(sum_nadal)
djf, djs, djd = sa.serving_data(sum_djoker)
otf, ots, otd = sa.serving_data(sum_other)
tf, ts, td = sa.serving_data(sum_thiem)
zf, zs, zd = sa.serving_data(sum_zverev)

# Serving scores, reused by the radar charts further down.
score_Federer_serving = sa.score_serving(ff, fs, fd)
score_Nadal_serving = sa.score_serving(nf, ns, nd)
score_Djoker_serving = sa.score_serving(djf, djs, djd)
score_Other_serving = sa.score_serving(otf, ots, otd)
score_Zverev_serving = sa.score_serving(zf, zs, zd)
score_Thiem_serving = sa.score_serving(tf, ts, td)
print(score_Zverev_serving, score_Thiem_serving)

# Only the Big Three and "Others" are plotted (same order as `names` below).
raw_data = {
    'greenBars': [ff, nf, djf, otf],
    'orangeBars': [fs, ns, djs, ots],
    'blueBars': [fd, nd, djd, otd],
}
df = pd.DataFrame(raw_data)

# From raw value to percentage of each player's total
totals = [i + j + k for i, j, k in zip(df['greenBars'], df['orangeBars'], df['blueBars'])]
greenBars = [i / j * 100 for i, j in zip(df['greenBars'], totals)]
orangeBars = [i / j * 100 for i, j in zip(df['orangeBars'], totals)]
blueBars = [i / j * 100 for i, j in zip(df['blueBars'], totals)]

# plot
barWidth = 0.85
names = ('Federer', 'Nadal', 'Djokovic', 'Others')
# Create green Bars
plt.bar(r, greenBars, color='#b5ffb9', edgecolor='white', width=barWidth, label="First serve win")
# Create orange Bars, stacked on top of the green ones
plt.bar(r, orangeBars, bottom=greenBars, color='#f9bc86', edgecolor='white', width=barWidth, label="Second serve win")
# Create blue Bars, stacked on top of green + orange
plt.bar(r, blueBars, bottom=[i + j for i, j in zip(greenBars, orangeBars)], color='#a3acff', edgecolor='white', width=barWidth, label="Double fault")

# Custom x axis
plt.xticks(r, names)
plt.xlabel("Name")
plt.ylabel("Percent of all serves")
plt.title("Serve proportion")

# Add a legend outside the plot area
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)

# Show graphic
plt.show()
74.35768623510836 76.99489387264717
# Scatter of double-fault rate vs. ace rate per winner, with a fitted
# regression line and the Big Three highlighted.
fig, axis = plt.subplots(figsize=(15, 10))

# Grid lines, title and axis labels
axis.yaxis.grid(True)
axis.xaxis.grid(True)
axis.set_title('Correlation between ace and double fault', fontsize=25, pad=25.0)
axis.set_xlabel('Double faults per Serve Point(%)', fontsize=20, labelpad=25.0)
axis.set_ylabel('Aces per Serve Point(%)', fontsize=20, labelpad=25.0)

# Column vectors (n, 1), the shape LinearRegression expects.
ace_data = (ace_winner['w_ace']).values.reshape(-1, 1)
df_data = (df_winner['w_df']).values.reshape(-1, 1)
firstserve_win = firstserve_winner['w_1stWon'].values.reshape(-1, 1)
secondserve_win = secondserve_winner['w_2ndWon'].values.reshape(-1, 1)

# Shared denominator for both percentages.
serve_points = firstserve_win + secondserve_win + df_data
X = df_data / serve_points * 100
Y = ace_data / serve_points * 100

linear_regressor_one = LinearRegression()  # create object for the class
linear_regressor_one.fit(X, Y)  # perform linear regression
Y_pred = linear_regressor_one.predict(X)  # make predictions

axis.plot(X, Y_pred, color='red')
axis.tick_params(axis='both', labelsize=15)
axis.scatter(X, Y)

# Highlight the Big Three. The original cell labelled Nadal's point
# 'Novak Djokovic' and Djokovic's point 'Rafael Nadal'; each point now uses the
# totals of the player named in its label. Colours match the mental toughness
# plot (Federer r, Djokovic g, Nadal y).
axis.scatter(*sa.serve_percentage(sum_federer), s=200, marker='s', color='r', label='Roger Federer')
axis.scatter(*sa.serve_percentage(sum_djoker), s=200, marker='s', color='g', label='Novak Djokovic')
axis.scatter(*sa.serve_percentage(sum_nadal), s=200, marker='s', color='y', label='Rafael Nadal')
axis.legend(prop={'size': 20})
<matplotlib.legend.Legend at 0x7fa74450a130>
def _break_point_percentage(row):
    """Break point percentage of one player row, computed by mt.get_percentage."""
    return mt.get_percentage(row['w_bpSaved'], row['w_bpFaced'])


# Total break points saved and faced per winner.
break_point_columns = ['winner_name', 'w_bpSaved', 'w_bpFaced']
mental_df = all_data[break_point_columns].groupby('winner_name').sum()

# Break point percentage column.
mental_df['percentage'] = mental_df.apply(_break_point_percentage, axis=1)

# Mental point score column; the helper adds 'mental_score' to mental_df.
player_names = list(mental_df.index)
mt.add_mental_points_col(mental_df, player_names, all_data)

# Scatter plot of mental score against break point percentage.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(mental_df['mental_score'], mental_df['percentage'], s=20)
ax.set_title("Break Point Win Percentage vs. Mental Point Score")
ax.set_xlabel('Mental Point Score')
ax.set_ylabel('Break Point Win Percentage')

# Mark the Big Three with their own colours.
mt.annotate_plot(
    ax,
    mental_df,
    ["Roger Federer", "Novak Djokovic", "Rafael Nadal"],
    ["r", "g", "y"],
)
plt.legend(prop={'size': 15})
plt.show()

# Ranking table: highest mental score first.
mental_df = mental_df.sort_values('mental_score', ascending=False)
pd.set_option('display.max_rows', 525)
mental_df.head(30)
| w_bpSaved | w_bpFaced | percentage | mental_score | |
|---|---|---|---|---|
| winner_name | ||||
| Novak Djokovic | 1046.0 | 1538.0 | 68.01 | 94 |
| Roger Federer | 972.0 | 1366.0 | 71.16 | 87 |
| Stan Wawrinka | 653.0 | 927.0 | 70.44 | 85 |
| Marin Cilic | 478.0 | 674.0 | 70.92 | 67 |
| Andy Murray | 746.0 | 1118.0 | 66.73 | 65 |
| Feliciano Lopez | 367.0 | 517.0 | 70.99 | 62 |
| Rafael Nadal | 1026.0 | 1444.0 | 71.05 | 60 |
| John Isner | 220.0 | 283.0 | 77.74 | 59 |
| Gael Monfils | 459.0 | 714.0 | 64.29 | 57 |
| Fernando Verdasco | 583.0 | 845.0 | 68.99 | 57 |
| Kei Nishikori | 442.0 | 688.0 | 64.24 | 57 |
| Andreas Seppi | 265.0 | 415.0 | 63.86 | 55 |
| Janko Tipsarevic | 256.0 | 380.0 | 67.37 | 50 |
| Lleyton Hewitt | 453.0 | 683.0 | 66.33 | 49 |
| Nicolas Almagro | 259.0 | 378.0 | 68.52 | 49 |
| David Ferrer | 589.0 | 920.0 | 64.02 | 48 |
| Gilles Muller | 219.0 | 283.0 | 77.39 | 46 |
| Jo-Wilfried Tsonga | 430.0 | 589.0 | 73.01 | 46 |
| Tommy Robredo | 441.0 | 653.0 | 67.53 | 46 |
| Viktor Troicki | 287.0 | 413.0 | 69.49 | 45 |
| Gilles Simon | 464.0 | 671.0 | 69.15 | 44 |
| Jarkko Nieminen | 336.0 | 498.0 | 67.47 | 43 |
| Fabio Fognini | 278.0 | 441.0 | 63.04 | 42 |
| Tomas Berdych | 528.0 | 709.0 | 74.47 | 40 |
| Kevin Anderson | 243.0 | 340.0 | 71.47 | 40 |
| Tommy Haas | 275.0 | 415.0 | 66.27 | 40 |
| Philipp Kohlschreiber | 323.0 | 426.0 | 75.82 | 37 |
| Juan Carlos Ferrero | 314.0 | 476.0 | 65.97 | 37 |
| Andy Roddick | 341.0 | 446.0 | 76.46 | 37 |
| Mikhail Youzhny | 457.0 | 668.0 | 68.41 | 36 |
# Calculate the difference of losing and winning match length for different players
dif_Federer = rs.match_len_diff(lose_federer, win_federer)
dif_Nadal = rs.match_len_diff(lose_nadal, win_nadal)
dif_Djoker = rs.match_len_diff(lose_djoker, win_djoker)
dif_Other = rs.match_len_diff(lose_other, win_other)
print(dif_Federer, dif_Nadal, dif_Djoker, dif_Other)

# Mean match length per player, once as winner and once as loser.
group_win = all_data.groupby(['winner_name'])
group_lose = all_data.groupby(['loser_name'])
win_length = group_win['minutes'].mean()
lose_length = group_lose['minutes'].mean()

# (mean losing length - mean winning length) for every player who has won a match.
# Aligning on the winners' index replaces the per-name get_group() loop: a
# winner with no recorded loss becomes NaN instead of raising KeyError, and NaN
# entries (also from missing 'minutes') are dropped so they cannot distort the
# extremes below.
dif_by_player = (lose_length.reindex(win_length.index) - win_length).dropna()
dif = dif_by_player.sort_values(ascending=False).tolist()

# Extremes across all players; passed to rs.match_length_score below.
dif_Largest = dif_by_player.max()
dif_Least = dif_by_player.min()

# Calculate the scores of match length for different players
score_Federer_length = rs.match_length_score(dif_Federer, dif_Least, dif_Largest)
score_Nadal_length = rs.match_length_score(dif_Nadal, dif_Least, dif_Largest)
score_Djoker_length = rs.match_length_score(dif_Djoker, dif_Least, dif_Largest)
score_Other_length = rs.match_length_score(dif_Other, dif_Least, dif_Largest)
59.29629629629629 30.856630824372758 29.57163671685285 -1.7450049915708803
# Mental scores of the individual players, looked up by name.
mental_scores = mental_df['mental_score']
score_Federer_mental = mental_scores.loc['Roger Federer']
score_Djoker_mental = mental_scores.loc['Novak Djokovic']
score_Nadal_mental = mental_scores.loc['Rafael Nadal']
score_Zverev_mental = mental_scores.loc['Alexander Zverev']
score_Thiem_mental = mental_scores.loc['Dominic Thiem']

# Average mental score of everyone except the Big Three.
big_three_names = ['Roger Federer', 'Novak Djokovic', 'Rafael Nadal']
mental_df_other = mental_df.drop(big_three_names)
score_Other_mental = mental_df_other['mental_score'].mean()
# Radar chart of the average "other player" profile.
# The first category and value are repeated at the end, presumably so that the
# outline of the polygon closes.
categories = ['Serving Skills', 'Match Length', 'Mental Toughness', 'Serving Skills']
other_scores = [score_Other_serving, score_Other_length, score_Other_mental, score_Other_serving]

fig = go.Figure()
other_trace = go.Scatterpolar(
    r=other_scores,
    theta=categories,
    fill=None,
    name='Other player average',
)
fig.add_trace(other_trace)

chart_title = {
    'text': "Other player average",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 20},
}
radial_axis = {'visible': True, 'range': [0, 100]}
fig.update_layout(
    title=chart_title,
    polar={'radialaxis': radial_axis},
    showlegend=True,
)
fig.show()
# Radar chart comparing the three Big Three profiles.
# The first category and value are repeated at the end, presumably so that the
# outline of each polygon closes.
categories = ['Serving Skills', 'Match Length', 'Mental Toughness', 'Serving Skills']

# (legend name, scores) in drawing order.
big3_profiles = [
    ('Federer', [score_Federer_serving, score_Federer_length, score_Federer_mental, score_Federer_serving]),
    ('Djokovic', [score_Djoker_serving, score_Djoker_length, score_Djoker_mental, score_Djoker_serving]),
    ('Nadal', [score_Nadal_serving, score_Nadal_length, score_Nadal_mental, score_Nadal_serving]),
]

fig = go.Figure()
for trace_name, trace_scores in big3_profiles:
    fig.add_trace(go.Scatterpolar(r=trace_scores, theta=categories, fill=None, name=trace_name))

chart_title = {
    'text': "Big 3",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 20},
}
# The radial axis starts at 50 for this chart.
radial_axis = {'visible': True, 'range': [50, 100]}
fig.update_layout(
    title=chart_title,
    polar={'radialaxis': radial_axis},
    showlegend=True,
)
fig.show()
def _mean_of_three(first, second, third):
    """Arithmetic mean of three scores."""
    return (first + second + third) / 3


# Big Three averages per category (Federer, Nadal, Djokovic).
average_Big3_serving = _mean_of_three(score_Federer_serving, score_Nadal_serving, score_Djoker_serving)
average_Big3_length = _mean_of_three(score_Federer_length, score_Nadal_length, score_Djoker_length)
average_Big3_mental = _mean_of_three(score_Federer_mental, score_Nadal_mental, score_Djoker_mental)

# The first category and value are repeated at the end, presumably so that the
# outline of each polygon closes.
categories = ['Serving Skills', 'Match Length', 'Mental Toughness', 'Serving Skills']

# (legend name, scores) in drawing order.
average_profiles = [
    ('Big 3 average', [average_Big3_serving, average_Big3_length, average_Big3_mental, average_Big3_serving]),
    ('Other player average', [score_Other_serving, score_Other_length, score_Other_mental, score_Other_serving]),
]

fig = go.Figure()
for trace_name, trace_scores in average_profiles:
    fig.add_trace(go.Scatterpolar(r=trace_scores, theta=categories, fill=None, name=trace_name))

chart_title = {
    'text': "Big 3 average VS Other player average",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 20},
}
radial_axis = {'visible': True, 'range': [0, 100]}
fig.update_layout(
    title=chart_title,
    polar={'radialaxis': radial_axis},
    showlegend=True,
)
fig.show()
# Rising stars: Alexander Zverev and Dominic Thiem.
# Match-length frames, same helper as used for the Big Three.
lose_zverev, win_zverev = ml.player_match_length("Alexander Zverev", data_other, data_zverev)
lose_thiem, win_thiem = ml.player_match_length("Dominic Thiem", data_other, data_thiem)

# Losing vs. winning match-length difference per player.
dif_Zverev = rs.match_len_diff(lose_zverev, win_zverev)
dif_Thiem = rs.match_len_diff(lose_thiem, win_thiem)

# Scores use the same extremes (dif_Least, dif_Largest) as the other players.
length_bounds = (dif_Least, dif_Largest)
score_Zverev_length = rs.match_length_score(dif_Zverev, *length_bounds)
score_Thiem_length = rs.match_length_score(dif_Thiem, *length_bounds)
print(score_Zverev_length, score_Thiem_length)
70.6643932935493 67.38765071245889
# Radar chart: Big Three average vs. the two rising stars vs. other players.
# The first category and value are repeated at the end, presumably so that the
# outline of each polygon closes.
categories = ['Serving Skills', 'Match Length', 'Mental Toughness', 'Serving Skills']

# (legend name, scores, extra trace options) in drawing order.
comparison_profiles = [
    ('Big 3 average',
     [average_Big3_serving, average_Big3_length, average_Big3_mental, average_Big3_serving],
     {}),
    ('Zverev',
     [score_Zverev_serving, score_Zverev_length, score_Zverev_mental, score_Zverev_serving],
     {}),
    ('Thiem',
     [score_Thiem_serving, score_Thiem_length, score_Thiem_mental, score_Thiem_serving],
     {}),
    ('Other player average',
     [score_Other_serving, score_Other_length, score_Other_mental, score_Other_serving],
     {'marker': {'color': '#FFA15A'}}),
]

fig = go.Figure()
for trace_name, trace_scores, trace_options in comparison_profiles:
    fig.add_trace(go.Scatterpolar(
        r=trace_scores,
        theta=categories,
        fill=None,
        name=trace_name,
        **trace_options,
    ))

chart_title = {
    'text': "Big 3 average VS Rising Star VS Other player average",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 20},
}
radial_axis = {'visible': True, 'range': [0, 85]}
fig.update_layout(
    title=chart_title,
    polar={'radialaxis': radial_axis},
    showlegend=True,
)
fig.show()
# Radar chart: each of the Big Three against the two rising stars.
# The first category and value are repeated at the end, presumably so that the
# outline of each polygon closes.
categories = ['Serving Skills', 'Match Length', 'Mental Toughness', 'Serving Skills']

# (legend name, scores, extra trace options) in drawing order.
player_profiles = [
    ('Federer',
     [score_Federer_serving, score_Federer_length, score_Federer_mental, score_Federer_serving],
     {}),
    ('Djokovic',
     [score_Djoker_serving, score_Djoker_length, score_Djoker_mental, score_Djoker_serving],
     {}),
    ('Nadal',
     [score_Nadal_serving, score_Nadal_length, score_Nadal_mental, score_Nadal_serving],
     {}),
    ('Zverev',
     [score_Zverev_serving, score_Zverev_length, score_Zverev_mental, score_Zverev_serving],
     {'marker': {'color': '#7F7F7F'}}),
    ('Thiem',
     [score_Thiem_serving, score_Thiem_length, score_Thiem_mental, score_Thiem_serving],
     {}),
]

fig = go.Figure()
for trace_name, trace_scores, trace_options in player_profiles:
    fig.add_trace(go.Scatterpolar(
        r=trace_scores,
        theta=categories,
        fill=None,
        name=trace_name,
        **trace_options,
    ))

chart_title = {
    'text': "Big 3 VS Rising Star",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 20},
}
radial_axis = {'visible': True, 'range': [0, 100]}
fig.update_layout(
    title=chart_title,
    polar={'radialaxis': radial_axis},
    showlegend=True,
)
fig.show()
# Read all ranking CSV files matching the glob pattern through the project loader.
RANKING_FILES_PATTERN = "data/atp_rankings*.csv"
ranking_data = dp.read_ranking_files(RANKING_FILES_PATTERN)
ranking_data
| ranking_date | rank | player | points | |
|---|---|---|---|---|
| 0 | 20220103 | 1 | 104925 | 11540.0 |
| 1 | 20220103 | 2 | 106421 | 8640.0 |
| 2 | 20220103 | 3 | 100644 | 7840.0 |
| 3 | 20220103 | 4 | 126774 | 6540.0 |
| 4 | 20220103 | 5 | 126094 | 5150.0 |
| ... | ... | ... | ... | ... |
| 915613 | 20191230 | 1922 | 134833 | 1.0 |
| 915614 | 20191230 | 1922 | 144856 | 1.0 |
| 915615 | 20191230 | 1922 | 202326 | 1.0 |
| 915616 | 20191230 | 1926 | 207307 | 1.0 |
| 915617 | 20191230 | 1927 | 208186 | 1.0 |
1131389 rows × 4 columns
# Player ids (as stored in the 'player' column) -> display names for the five
# players compared in the ranking chart.
PLAYER_NAMES = {
    100644: 'Zverev',    # Alexander Zverev
    106233: 'Thiem',     # Dominic Thiem
    103819: 'Federer',
    104745: 'Nadal',
    104925: 'Djokovic',
}

# Keep only the rows of those five players, ordered by ranking date.
ranking = ranking_data[ranking_data['player'].isin(list(PLAYER_NAMES))]
ranking = ranking.sort_values(by=['ranking_date'])

# Dates are stored as YYYYMMDD numbers; convert them to real timestamps.
ranking['ranking_date'] = pd.to_datetime(ranking['ranking_date'], format='%Y%m%d')

# Translate ids to names in the 'player' column only. The previous frame-wide
# replace() calls would also have rewritten a matching number in 'rank' or
# 'points'.
ranking['player'] = ranking['player'].map(PLAYER_NAMES)
ranking
| ranking_date | rank | player | points | |
|---|---|---|---|---|
| 0 | 2010-01-04 | 1 | Federer | 10550.0 |
| 1 | 2010-01-04 | 2 | Nadal | 9205.0 |
| 2 | 2010-01-04 | 3 | Djokovic | 8310.0 |
| 1813 | 2010-01-11 | 2 | Nadal | 9310.0 |
| 1812 | 2010-01-11 | 1 | Federer | 10550.0 |
| ... | ... | ... | ... | ... |
| 61916 | 2022-08-29 | 3 | Nadal | 5630.0 |
| 63848 | 2022-09-12 | 3 | Nadal | 5810.0 |
| 63850 | 2022-09-12 | 5 | Zverev | 5040.0 |
| 63852 | 2022-09-12 | 7 | Djokovic | 3570.0 |
| 65109 | 2022-09-12 | 216 | Thiem | 241.0 |
2703 rows × 4 columns
# One ranking line per player over time.
fig = px.line(ranking, x="ranking_date", y="rank", color="player")
# Reverse the y axis so that rank 1 is drawn at the top.
fig.update_yaxes(autorange="reversed")

chart_title = {
    'text': "Big 3 VS Rising Star Ranking",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 20},
}
fig.update_layout(title=chart_title)
fig.show()